In [21]:
import numpy as np
import shapefile as shp
import pandas as pd
from pandas.api.types import CategoricalDtype

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import zipfile
import os

from ds100_utils import run_linear_regression_test

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

from IPython.display import Image


# Plot settings
plt.rcParams['figure.figsize'] = (12, 9)
plt.rcParams['font.size'] = 12
In [22]:
#first we loaded the Cook County training data. We used the training data over the sale data
#because the training data includes Sale Prices. We wanted to map the most expensive homes and
#least expensive homes in the county on a map of Cook County with the historical redlining
#districts to see if there was any correlation.
with zipfile.ZipFile('cook_county_data.zip') as item:
    item.extractall()
In [23]:
cook_county = pd.read_csv("cook_county_train.csv", index_col='Unnamed: 0')
In [24]:
#Looking at the 300 most expensive homes in the county. We picked the top 300 homes because
#there seems to be a huge jump and range in sale prices among the top 300. When we tried to 
#use a sample size greater than 300, our map started to make incoherent patterns.
cook_county_top300=cook_county.sort_values('Sale Price', ascending=False).head(300)
cook_county_top300['Sale Price']
Out[24]:
106057    71000000
71953     28000000
37835     12700000
929       12250000
19702     12000000
            ...   
103193     2840000
64193      2840000
140896     2837500
50825      2830000
167480     2825000
Name: Sale Price, Length: 300, dtype: int64
In [25]:
geometry=[Point(xy) for xy in zip(cook_county_top300.Longitude, cook_county_top300.Latitude)]
In [26]:
cook_county_top300['geometry'] = geometry
In [27]:
#Incorporated the shape file from Mapping Inequality (https://dsl.richmond.edu/panorama/redlining/#loc=11/41.641/-87.733)
#This loaded a base map.
#Used this resource to learn how to map onto shape files: https://towardsdatascience.com/geopandas-101-plot-any-data-with-a-latitude-and-longitude-on-a-map-98e01944b972
cook_map = gpd.read_file('cartodb-query.shp')
fig,ax = plt.subplots(figsize = (15,15))
cook_map.plot(ax = ax)
geometry=[Point(xy) for xy in zip(cook_county_top300.Longitude, cook_county_top300.Latitude)]
geo_df = gpd.GeoDataFrame(geometry = geometry)
print(geo_df)
g = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = 'Top 300')
plt.show()
                       geometry
0    POINT (-87.74947 41.74931)
1    POINT (-87.74617 42.11019)
2    POINT (-87.71695 42.09977)
3    POINT (-87.71283 42.09590)
4    POINT (-87.73599 42.11973)
..                          ...
295  POINT (-87.66717 41.95868)
296  POINT (-87.74789 42.11618)
297  POINT (-87.66328 41.95183)
298  POINT (-87.65001 41.91677)
299  POINT (-87.67095 42.03893)

[300 rows x 1 columns]
In [28]:
#Built an actual map since the different districts couldn't be seen on the base map above.
#Code was taken from this source: https://stackoverflow.com/questions/63644131/how-to-use-geopandas-to-plot-latitude-and-longitude-on-a-more-detailed-map-with
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
from shapely.geometry import Point


ward = gpd.read_file('cartodb-query.shp', bbox=None, mask=None, rows=None)
geo_df = gpd.GeoDataFrame(geometry = geometry)

ward.crs = {'init':"epsg:4326"}
geo_df.crs = {'init':"epsg:4326"}

# plot the polygon
ax = ward.plot(alpha=0.35, color='#d66058', zorder=1)
# plot the boundary only (without fill), just uncomment
#ax = gpd.GeoSeries(ward.to_crs(epsg=3857)['geometry'].unary_union).boundary.plot(ax=ax, alpha=0.5, color="#ed2518",zorder=2)
ax = gpd.GeoSeries(ward['geometry'].unary_union).boundary.plot(ax=ax, alpha=0.5, color="#ed2518",zorder=2)

# plot the marker
ax = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = 'Top 300', zorder=3)

ctx.add_basemap(ax, crs=geo_df.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)
plt.show()
In [29]:
#Imported an image of the redlined districts to see the different historically redlined areas.
from IPython import display
display.Image("Redlining.png")

#Source: https://digitalchicagohistory.org/files/original/1868c31f074d1ace5e7cac8f5851d60d.png
Out[29]:
In [30]:
#Manually plotted the location of where the markers are most concentrated in the base map onto
#the redlining map. We see that the most expensive homes are concentrated in one area that was
#historically desirable and another area which was historically redlined
from PIL import Image
from pylab import *


im = array(Image.open('Redlining.png'))

# plot the image
imshow(im)

# some points
x = [1320,1700,1680,1670,1660, 1650]
y = [700,700,650,550,500, 480]

plot(x,y,'ro')

# add title and show the plot
title('Present day most-expensive homes mapped on districts from the 1930s')
show()
In [31]:
#Similar to above, looked at the lowest valued homes.
cook_county_bottom300=cook_county.sort_values('Sale Price', ascending=False).tail(300)
cook_county['Sale_Price']=cook_county['Sale Price']
cook_county['Sale_Price']
Out[31]:
0              1
1         285000
2          22000
3         225000
4          22600
           ...  
204787     37100
204788    225000
204789    135000
204790    392000
204791    125000
Name: Sale_Price, Length: 204792, dtype: int64
In [32]:
#There seem to be a lot of null values of 1. I am curious to see which locations these null values fall in
cook_county_null= cook_county['Sale Price']==1
cook_county_null_1=cook_county[cook_county_null]
cook_county_null_1
Out[32]:
PIN Property Class Neighborhood Code Land Square Feet Town Code Apartments Wall Material Roof Material Basement Basement Finish ... Sale Half of Year Most Recent Sale Age Decade Pure Market Filter Garage Indicator Neigborhood Code (mapping) Town and Neighborhood Description Lot Size Sale_Price
0 17294100610000 203 50 2500.0 76 0.0 2.0 1.0 1.0 3.0 ... 2 1.0 13.2 0 0.0 50 7650 This property, sold on 09/14/2015, is a one-st... 2500.0 1
5 19174010300000 203 380 4390.0 72 0.0 2.0 1.0 1.0 3.0 ... 2 1.0 5.8 0 1.0 380 72380 This property, sold on 07/26/2018, is a one-st... 4390.0 1
14 21312250180000 202 100 2976.0 70 0.0 1.0 1.0 1.0 3.0 ... 1 0.0 12.2 0 1.0 100 70100 This property, sold on 06/28/2017, is a one-st... 2976.0 1
20 3323080010000 203 70 6500.0 38 0.0 2.0 1.0 1.0 3.0 ... 1 1.0 9.4 0 0.0 70 3870 This property, sold on 02/13/2019, is a one-st... 6500.0 1
23 6171050140000 203 11 9113.0 18 0.0 1.0 1.0 1.0 3.0 ... 1 0.0 4.1 0 1.0 11 1811 This property, sold on 02/22/2019, is a one-st... 9113.0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
204757 16264040290000 202 115 3125.0 77 0.0 1.0 1.0 2.0 3.0 ... 2 1.0 11.7 0 1.0 115 77115 This property, sold on 08/10/2017, is a one-st... 3125.0 1
204765 13174290110000 203 90 4250.0 71 0.0 2.0 1.0 1.0 3.0 ... 1 1.0 6.6 0 1.0 90 7190 This property, sold on 04/18/2019, is a one-st... 4250.0 1
204766 25051260290000 205 282 4404.0 72 0.0 2.0 1.0 1.0 3.0 ... 1 0.0 6.7 0 1.0 282 72282 This property, sold on 06/28/2016, is a two-st... 4404.0 1
204773 3161170030000 278 40 8750.0 38 0.0 1.0 1.0 3.0 3.0 ... 2 1.0 3.4 0 1.0 40 3840 This property, sold on 11/18/2019, is a two-st... 8750.0 1
204782 23293040180000 208 52 29848.0 30 0.0 2.0 1.0 1.0 3.0 ... 1 1.0 2.0 0 1.0 52 3052 This property, sold on 06/18/2019, is a two-st... 29848.0 1

35546 rows × 63 columns

In [33]:
#Mapped all the null values and their corresponding locations on the map. 
#Can see that this is a huge are. However notice that the northern part of the City of Chicago
#does not have any null values. Can speak to better data quality in that area. Also happens 
#to be where the wealthier houses are.
cook_map = gpd.read_file('cartodb-query.shp')
fig,ax = plt.subplots(figsize = (15,15))
cook_map.plot(ax = ax)
geometry3=[Point(xy) for xy in zip(cook_county_null_1.Longitude, cook_county_null_1.Latitude)]
geo_df = gpd.GeoDataFrame(geometry = geometry3)
print(geo_df)
g = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*')
plt.show()
                         geometry
0      POINT (-87.65426 41.84080)
1      POINT (-87.76966 41.78458)
2      POINT (-87.55556 41.74603)
3      POINT (-87.98510 42.07181)
4      POINT (-88.23988 42.04467)
...                           ...
35541  POINT (-87.70902 41.84374)
35542  POINT (-87.76924 41.95748)
35543  POINT (-87.65959 41.72984)
35544  POINT (-87.96293 42.12189)
35545  POINT (-87.88965 41.65979)

[35546 rows x 1 columns]
In [35]:
ward = gpd.read_file('cartodb-query.shp', bbox=None, mask=None, rows=None)
geo_df = gpd.GeoDataFrame(geometry = geometry3)

ward.crs = {'init':"epsg:4326"}
geo_df.crs = {'init':"epsg:4326"}

# plot the polygon
ax = ward.plot(alpha=0.35, color='#d66058', zorder=1)

ax = gpd.GeoSeries(ward['geometry'].unary_union).boundary.plot(ax=ax, alpha=0.5, color="#ed2518",zorder=2)

# plot the marker
ax = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = '', zorder=3)

ctx.add_basemap(ax, crs=geo_df.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)
plt.show()
In [36]:
#Had to clean the data a little, thousands of data points had a null entry represented by "1".
#Removed Null entries.
cook_county_clean=cook_county[cook_county.Sale_Price !=1]
cook_county_clean['Sale_Price']
Out[36]:
1         285000
2          22000
3         225000
4          22600
6         100000
           ...  
204787     37100
204788    225000
204789    135000
204790    392000
204791    125000
Name: Sale_Price, Length: 169246, dtype: int64
In [37]:
#Sorted according to sale price from least expensive.
cook_county_clean_bottom300=cook_county_clean.sort_values('Sale_Price', ascending=False).tail(300)
cook_county_clean_bottom300['Sale_Price']
Out[37]:
19474     100
182081    100
38862     100
52819     100
94113     100
         ... 
82174      10
77126      10
822         6
184342      4
36337       2
Name: Sale_Price, Length: 300, dtype: int64
In [38]:
#Similar to the first section, loaded the base map and plotted the least expensive homes onto it.
#Data shows that the spread is greater than the top 300 (huge property valuation disparity)
cook_map = gpd.read_file('cartodb-query.shp')
fig,ax = plt.subplots(figsize = (15,15))
cook_map.plot(ax = ax)
geometry2=[Point(xy) for xy in zip(cook_county_clean_bottom300.Longitude, cook_county_clean_bottom300.Latitude)]
geo_df = gpd.GeoDataFrame(geometry = geometry2)
print(geo_df)
g = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = 'Top 300')
plt.show()
                       geometry
0    POINT (-87.62409 41.62768)
1    POINT (-87.54236 41.62020)
2    POINT (-87.55964 41.74126)
3    POINT (-87.68628 41.77345)
4    POINT (-87.78374 41.78344)
..                          ...
295  POINT (-87.71545 41.80094)
296  POINT (-87.76010 41.94714)
297  POINT (-87.79178 41.78810)
298  POINT (-87.79178 41.78810)
299  POINT (-87.66764 41.78456)

[300 rows x 1 columns]
In [39]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
from shapely.geometry import Point


ward = gpd.read_file('cartodb-query.shp', bbox=None, mask=None, rows=None)
geo_df = gpd.GeoDataFrame(geometry = geometry2)

ward.crs = {'init':"epsg:4326"}
geo_df.crs = {'init':"epsg:4326"}

# plot the polygon
ax = ward.plot(alpha=0.35, color='#d66058', zorder=1)

ax = gpd.GeoSeries(ward['geometry'].unary_union).boundary.plot(ax=ax, alpha=0.5, color="#ed2518",zorder=2)

# plot the marker
ax = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = 'Bottom 300', zorder=3)

ctx.add_basemap(ax, crs=geo_df.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)
plt.show()
In [40]:
from IPython import display
display.Image("Redlining.png")
Out[40]:
In [41]:
#See that least expensive homes are in red and yellow areas but not anywhere green or blue which
#were the historically desirable areas.
from PIL import Image
from pylab import *

# read image to array
im = array(Image.open('Redlining.png'))

# plot the image
imshow(im)

# some points
x = [1250,1220,1220,1220,1350,1300,1250,1200,900,1100,1000,1050,1000]
y = [750,700,850,800,1200,1100,1250,1300,500,600,1000,1200,750]

# plot the points with red star-markers
plot(x,y,'ro')

# add title and show the plot
title('Present day least expensive homes mapped on districts from the 1930s')
show()
In [ ]: